Affine

将来自输入数据的若干行按照一个“context”(偏移集合)拼接/裁切成一个中间矩阵,再对该中间矩阵与权重做一次矩阵乘和偏置加法操作,最后应用激活函数(如果指定),得到输出结果。

该算子支持全量运行和增量运行两种模式,并维护了上一次全窗口的输出(previous_output)以支持增量更新。

输入:
  • input0 - 输入数据张量地址。

  • input1 - 权重矩阵地址。

  • input2 - 偏置向量地址。

  • input0_shape - 输入数据形状数组,长度为3,第一维值为1。

  • input1_shape - 权重矩阵形状数组,长度为3,第一维值为1。

  • input2_shape - 偏置向量形状数组,长度为3,第一维值为1。

  • output_shape - 输出张量形状数组,长度为3,第一维值为1。

  • context - 上下文索引数组(相对行偏移),长度为 context_size,值递增,例如 {-1, 0, 1, 2}。

  • context_size - 上下文大小。

  • output_dim - 输出维度,即输入张量最后一维大小乘以上下文大小(也就是拼接后中间矩阵的列数,与权重矩阵的行数一致)。

  • activation_type - 激活函数类型,0-8。

  • is_full_run - 全量运行标志指针,所指值为1时为全量更新(运行后修改为0),为0时为增量更新。

  • full_input - 全量输入缓冲区地址。

  • full_input_shape - 全量输入形状数组。

  • increment_input - 增量输入缓冲区地址。

  • increment_input_shape - 增量输入形状数组。

  • increment_output - 增量输出缓冲区地址。

  • increment_output_shape - 增量输出形状数组。

  • previous_output - 先前输出缓冲区地址。

  • previous_output_shape - 先前输出形状数组。

  • core_mask(int, 可选) - 核掩码(仅适用于共享存储版本)。

输出:
  • output - 仿射变换结果张量地址。

支持平台:

FT78NE、MT7004

备注:

  • FT78NE 支持的数据类型:int8, fp32

  • MT7004 支持的数据类型:fp16, fp32

激活函数类型定义:

1#define ActivationType_NO_ACTIVATION 0   // No activation function
2#define ActivationType_RELU          1   // ReLU activation function
3#define ActivationType_RELU6         2   // ReLU6 activation function
4#define ActivationType_SIGMOID       3   // Sigmoid activation function
5#define ActivationType_TANH          4   // Tanh activation function
6#define ActivationType_SWISH         5   // Swish activation function
7#define ActivationType_HSWISH        6   // Hard Swish activation function
8#define ActivationType_HSIGMOID      7   // Hard Sigmoid activation function
9#define ActivationType_SOFTPLUS      8   // Softplus activation function

激活函数数学公式:

  • ReLU: \(f(x) = \max(0, x)\)

  • ReLU6: \(f(x) = \min(\max(0, x), 6)\)

  • Sigmoid: \(f(x) = \frac{1}{1 + e^{-x}}\)

  • Tanh: \(f(x) = \frac{e^x - e^{-x}}{e^x + e^{-x}}\)

  • Swish: \(f(x) = x \cdot \sigma(x) = \frac{x}{1 + e^{-x}}\)

  • Hard Swish: \(f(x) = x \cdot \frac{\min(\max(x + 3, 0), 6)}{6}\)

  • Hard Sigmoid: \(f(x) = \frac{\min(\max(x + 3, 0), 6)}{6}\)

  • Softplus: \(f(x) = \ln(1 + e^x)\)

参数数组结构:

 1long long params[21];                             // 21 slots; each holds an address or an integer value cast to long long
 2params[0] = (long long)input0;                    // address of the input data tensor
 3params[1] = (long long)input1;                    // address of the weight matrix
 4params[2] = (long long)input2;                    // address of the bias vector
 5params[3] = (long long)output;                    // address of the output tensor
 6params[4] = (long long)input0_shape;              // shape array of the input data
 7params[5] = (long long)input1_shape;              // shape array of the weight matrix
 8params[6] = (long long)input2_shape;              // shape array of the bias vector
 9params[7] = (long long)output_shape;              // shape array of the output tensor
10params[8] = (long long)context;                   // context index array
11params[9] = (long long)context_size;              // context size (passed by value)
12params[10] = (long long)output_dim;               // output dimension (passed by value)
13params[11] = (long long)activation_type;          // activation function type (passed by value)
14params[12] = (long long)&is_full_run;             // pointer to the full-run flag
15params[13] = (long long)full_input;               // address of the full-input buffer
16params[14] = (long long)full_input_shape;         // shape array of the full input
17params[15] = (long long)increment_input;          // address of the incremental-input buffer
18params[16] = (long long)increment_input_shape;    // shape array of the incremental input
19params[17] = (long long)increment_output;         // address of the incremental-output buffer
20params[18] = (long long)increment_output_shape;   // shape array of the incremental output
21params[19] = (long long)previous_output;          // address of the previous-output buffer
22params[20] = (long long)previous_output_shape;    // shape array of the previous output

共享存储版本:

void i8_affine_s(long long *params, int core_mask)
void fp_affine_s(long long *params, int core_mask)
void hp_affine_s(long long *params, int core_mask)

C调用示例:

 1// FT78NE 多核示例
 2#include <stdio.h>
 3#include <stdlib.h>
 4#include <time.h>
 5#include <affine.h>
 6
 7// Multi-core sample: runs fp_affine_s once on random fp32 data (a input rows, b input columns, c context size, o output columns).
 8void test_fp_affine_s(int a, int b, int c, int o, int activation_type, int full_run, int core_mask) {
 9    int i = 0, j = 0;
10    srand(time(0));
11
12    int core_id = DNUM;
13    int logic_core_id = GetLogicCoreId(core_mask, core_id);
14    int num = GetCoreNum(core_mask);  // NOTE(review): not used below
15
16    int is_full_run = full_run;  // its address is passed in params[12]
17    int context[] = {-1, 0, 1, 2};
18    int context_size = c;
19    int output_dim = b * c;
20    if (c < 1 || c > (int)(sizeof(context) / sizeof(context[0]))) { return; }  // context[c - 1] is read below
21
22    // Shape definitions
23    int input0_shape[3] = {1, a, b};
24    int input1_shape[3] = {1, b * c, o};
25    int input2_shape[3] = {1, a - c + 1, o};
26    int output_shape[3] = {1, a - c + 1, o};
27
28    // Intermediate buffer shapes
29    int full_input_shape[3] = {1, input0_shape[1] - (context[context_size - 1] - context[0]), output_dim};
30    int increment_input_shape[3] = {1, 1, output_dim};
31    int increment_output_shape[3] = {1, 1, output_shape[2]};
32    int previous_output_shape[3] = {1, output_shape[1], output_shape[2]};
33
34    // Buffers live at fixed addresses, 1 MiB apart
35    float* input0 = (float*)(0xA0400000);
36    float* input1 = (float*)(0xA0400000 + 0x100000);
37    float* input2 = (float*)(0xA0400000 + 0x200000);
38    float* output = (float*)(0xA0400000 + 0x300000);
39    float* full_input = (float*)(0xA0400000 + 0x400000);
40    float* increment_input = (float*)(0xA0400000 + 0x500000);
41    float* increment_output = (float*)(0xA0400000 + 0x600000);
42    float* previous_output = (float*)(0xA0400000 + 0x700000);
43
44    // Initialise the data on logical core 0 only, with values in [-1, 1]
45    if (logic_core_id == 0) {
46        int input0_len = input0_shape[0] * input0_shape[1] * input0_shape[2];
47        int input1_len = input1_shape[0] * input1_shape[1] * input1_shape[2];
48        for (i = 0; i < input0_len; i++) {
49            input0[i] = ((float)rand() / RAND_MAX) * 2 - 1;
50        }
51        for (i = 0; i < input1_len; i++) {
52            input1[i] = ((float)rand() / RAND_MAX) * 2 - 1;
53        }
54        for (i = 0; i < input2_shape[2]; i++) {  // one random bias per output column, copied to every row
55            input2[i] = ((float)rand() / RAND_MAX) * 2 - 1;
56            for (j = 1; j < input2_shape[1]; j++) {
57                input2[i + j * input2_shape[2]] = input2[i];
58            }
59        }
60    }
61
62    // Build the parameter array
63    long long params[21];
64    params[0] = (long long)input0;
65    params[1] = (long long)input1;
66    params[2] = (long long)input2;
67    params[3] = (long long)output;
68    params[4] = (long long)input0_shape;
69    params[5] = (long long)input1_shape;
70    params[6] = (long long)input2_shape;
71    params[7] = (long long)output_shape;
72    params[8] = (long long)context;
73    params[9] = (long long)context_size;
74    params[10] = (long long)output_dim;
75    params[11] = (long long)activation_type;
76    params[12] = (long long)&is_full_run;
77    params[13] = (long long)full_input;
78    params[14] = (long long)full_input_shape;
79    params[15] = (long long)increment_input;
80    params[16] = (long long)increment_input_shape;
81    params[17] = (long long)increment_output;
82    params[18] = (long long)increment_output_shape;
83    params[19] = (long long)previous_output;
84    params[20] = (long long)previous_output_shape;
85
86    // Run the Affine operator
87    fp_affine_s(params, core_mask);
88}
89
90int main(void) {
91    /* a x b input, context size c, o output columns */
92    const int a = 23, b = 31, c = 4, o = 29;
93    const int core_mask = 0xff;     /* core mask */
94    const int full_run = 1;         /* full-run flag */
95    const int activation_type = 0;  /* activation function type */
96
97    test_fp_affine_s(a, b, c, o, activation_type, full_run, core_mask);
98    return 0;
99}

私有存储版本:

void i8_affine_p(long long *params)
void fp_affine_p(long long *params)
void hp_affine_p(long long *params)

C调用示例:

 1// FT78NE 单核示例
 2#include <stdio.h>
 3#include <affine.h>
 4
 5int main(void) {
 6    // Parameter setup (mirrors the shared-storage sample)
 7    int a = 32, b = 16, c = 4, o = 16;
 8    int activation_type = 0;  // 0 = ActivationType_NO_ACTIVATION
 9    int is_full_run = 1;      // 1 = full run; its address is passed in params[12]
10    int context[] = {-1, 0, 1, 2};
11    int context_size = c;
12    int output_dim = b * c;
13
14    // Tensor shapes
15    int input0_shape[3] = {1, a, b};
16    int input1_shape[3] = {1, b * c, o};
17    int input2_shape[3] = {1, a - c + 1, o};
18    int output_shape[3] = {1, a - c + 1, o};
19
20    // Intermediate buffer shapes
21    int full_input_shape[3] = {1, input0_shape[1] - (context[context_size - 1] - context[0]), output_dim};
22    int increment_input_shape[3] = {1, 1, output_dim};
23    int increment_output_shape[3] = {1, 1, output_shape[2]};
24    int previous_output_shape[3] = {1, output_shape[1], output_shape[2]};
25
26    // Buffers live at fixed addresses, 1 MiB apart.
27    // NOTE(review): this sample does not fill input0/input1/input2 before the call.
28    float* input0 = (float*)(0x10810000);
29    float* input1 = (float*)(0x10810000 + 0x100000);
30    float* input2 = (float*)(0x10810000 + 0x200000);
31    float* output = (float*)(0x10810000 + 0x300000);
32    float* full_input = (float*)(0x10810000 + 0x400000);
33    float* increment_input = (float*)(0x10810000 + 0x500000);
34    float* increment_output = (float*)(0x10810000 + 0x600000);
35    float* previous_output = (float*)(0x10810000 + 0x700000);
36
37    // Build the parameter array (same layout as the shared-storage version)
38    long long params[21];
39    params[0] = (long long)input0;
40    params[1] = (long long)input1;
41    params[2] = (long long)input2;
42    params[3] = (long long)output;
43    params[4] = (long long)input0_shape;
44    params[5] = (long long)input1_shape;
45    params[6] = (long long)input2_shape;
46    params[7] = (long long)output_shape;
47    params[8] = (long long)context;
48    params[9] = (long long)context_size;
49    params[10] = (long long)output_dim;
50    params[11] = (long long)activation_type;
51    params[12] = (long long)&is_full_run;
52    params[13] = (long long)full_input;
53    params[14] = (long long)full_input_shape;
54    params[15] = (long long)increment_input;
55    params[16] = (long long)increment_input_shape;
56    params[17] = (long long)increment_output;
57    params[18] = (long long)increment_output_shape;
58    params[19] = (long long)previous_output;
59    params[20] = (long long)previous_output_shape;
60
61    // Call Affine
62    fp_affine_p(params);
63    return 0;
64}